From 1e86dc5e2bb07b999c406aa6d7774a20132ecb67 Mon Sep 17 00:00:00 2001 From: Lumpiasty Date: Sun, 21 Jun 2026 02:00:32 +0200 Subject: [PATCH] Detect GPON blackhole using ping --- ansible/roles/routeros/tasks/routing.yml | 38 +++++++++++++++++++++--- ansible/roles/routeros/tasks/wan.yml | 1 + docs/lte-failover-design.md | 16 +++++----- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/ansible/roles/routeros/tasks/routing.yml b/ansible/roles/routeros/tasks/routing.yml index 3281c6c..9e4e928 100644 --- a/ansible/roles/routeros/tasks/routing.yml +++ b/ansible/roles/routeros/tasks/routing.yml @@ -12,15 +12,44 @@ scope: 30 suppress-hw-offload: false target-scope: 10 - - disabled: false + - comment: GPON Monitor 1 + disabled: false + distance: 1 + dst-address: 1.0.0.1/32 + gateway: pppoe-gpon + routing-table: main + scope: 10 + suppress-hw-offload: false + target-scope: 10 + - comment: GPON Monitor 2 + disabled: false + distance: 1 + dst-address: 8.8.4.4/32 + gateway: pppoe-gpon + routing-table: main + scope: 10 + suppress-hw-offload: false + target-scope: 10 + - comment: GPON Default 1 + disabled: false distance: 1 dst-address: 0.0.0.0/0 - gateway: pppoe-gpon + gateway: 1.0.0.1 + check-gateway: ping routing-table: main scope: 30 suppress-hw-offload: false - target-scope: 10 - vrf-interface: pppoe-gpon + target-scope: 11 + - comment: GPON Default 2 + disabled: false + distance: 2 + dst-address: 0.0.0.0/0 + gateway: 8.8.4.4 + check-gateway: ping + routing-table: main + scope: 30 + suppress-hw-offload: false + target-scope: 11 handle_absent_entries: remove handle_entries_content: remove_as_much_as_possible @@ -32,6 +61,7 @@ distance: 1 dst-address: 2000::/3 gateway: 2001:470:70:dd::1 + check-gateway: ping scope: 30 target-scope: 10 - comment: Tailnet diff --git a/ansible/roles/routeros/tasks/wan.yml b/ansible/roles/routeros/tasks/wan.yml index e80d608..27ec862 100644 --- a/ansible/roles/routeros/tasks/wan.yml +++ b/ansible/roles/routeros/tasks/wan.yml @@ -10,6 
+10,7 @@ password: "{{ routeros_pppoe_password }}" # Using CoreDNS container with DNS64 use-peer-dns: false + add-default-route: false user: "{{ routeros_pppoe_username }}" handle_absent_entries: remove handle_entries_content: remove_as_much_as_possible diff --git a/docs/lte-failover-design.md b/docs/lte-failover-design.md index 82fdf6d..68f5525 100644 --- a/docs/lte-failover-design.md +++ b/docs/lte-failover-design.md @@ -84,9 +84,10 @@ subnets would fail routing lookup with "net unreachable" without it. | Destination | Source | Distance | Active when | |---|---|---|---| -| `0.0.0.0/0` | static via `pppoe-gpon` | 1 | GPON up | +| `1.0.0.1/32`, `8.8.4.4/32` | static via `pppoe-gpon` | 1 | always | +| `0.0.0.0/0` | static via `1.0.0.1`, `8.8.4.4` (recursive) | 1, 2 | GPON ping check succeeds | | `0.0.0.0/0` | BGP from D-Link via `192.168.6.2` | 200 | wwan up on D-Link | -| `2000::/3` | static via `sit1` (HE tunnel) | 1 | sit1 active (HE tunnel works) | +| `2000::/3` | static via `2001:470:70:dd::1` (HE tunnel) | 1 | HE tunnel ping check succeeds | | `2000::/3` | BGP from D-Link via `2001:470:61a3:600::2` | 200 | wwan up on D-Link | RouterOS distance comparison is straightforward: distance 1 always wins @@ -136,11 +137,12 @@ preferred route for D-Link's own traffic. 
- **wwan modem goes down** → BIRD2 device protocol detects wwan0 down → static `lte_default` / `lte_default6` become unreachable → BGP withdraws announcements → CRS removes BGP-learned default -- **GPON drops** → `pppoe-gpon` interface down → CRS distance-1 default - route inactive → distance-200 BGP route activates → CRS withdraws its - default-originate announcement to D-Link (since no default is installed - any more) → D-Link's kernel default-via-CRS is removed → D-Link uses - wwan kernel default → traffic flows from CRS via vlan6 → D-Link → wwan +- **GPON drops or blackholes** → recursive ping checks (1.0.0.1, 8.8.4.4) over `pppoe-gpon` + fail (takes ~20s: probes go out every 10s and the gateway is declared unreachable after two consecutive timeouts) → CRS distance-1/2 default routes inactive → distance-200 BGP route + activates → CRS withdraws its default-originate announcement to D-Link (loop + prevention stops CRS from reflecting D-Link's own route back) → D-Link's kernel + default-via-CRS is removed → D-Link uses wwan kernel default → traffic flows + from CRS via vlan6 → D-Link → wwan -All transitions are automatic and driven by interface state. No active -probing (Netwatch / mwan3), no scripts toggling routes. +All transitions are automatic. LTE-side transitions are driven by interface state; GPON failover +additionally relies on `check-gateway=ping` probes. No Netwatch / mwan3, no scripts toggling routes.